Nous avons réparti le travail au sein du groupe en fonction des intérêts et des compétences de chacun. Nous avons donc eu la répartition des tâches suivante :
Massi : Utilisation de R/Python pour le traitement des données pour la partie revenus fiscaux par département + réalisation du modèle ridge
Thomas : Utilisation de RMarkdown pour présenter les données au format HTML, réalisation du rapport
Houssem : Traitement des données pour la partie démographique
Alick : Utilisation de R/Python pour le traitement des données d’hospitalisation et de réanimation et faire les graphiques et du modèle de régression linéaire
Marjorie : Traitement des données pour la partie nombre d’habitants par département + rapport RMarkdown
Ensemble des membres : Récupération des sources de données externes
Sujet 2 : Modèles de régression et sélection de variables : Existe-t-il un lien entre des facteurs climatiques (température, pluviométrie, ensoleillement, vent,… ) et socio-économiques (revenu fiscal, age, …) et l’évolution du nombre d’hospitalisations (ou réanimations) à l’échelle du département?
Ce rapport contient les résultats d’analyse de données d’hospitalisation et réanimation de la crise du covid sur le plan national et départemental.
Pour entrer dans le vif du sujet, voici un graphique dynamique représentant le nombre de réanimations pour chaque département.
# Load data-wrangling, Excel-import and plotting libraries
library(dplyr)
library(readxl)
library(ggplot2)
# read files
# COVID hospitalisation counts per department (data.gouv.fr, semicolon-separated CSV)
data_covid <- read.csv("https://www.data.gouv.fr/fr/datasets/r/63352e38-d353-4b54-bfd1-f1b3ee1cabd7",
sep = ";", header = TRUE)
# NOTE(review): the read_excel() calls below use absolute local Windows paths,
# so this report only knits on the original author's machine — consider
# project-relative paths (e.g. a data/ folder).
estim_pop_dep_sexe_gca_1975_2021 <- read_excel("C:/Users/alave/Downloads/data_NbHabitants_Dep.xls", sheet = "2021")
# Daily temperature observations per department (data.gouv.fr)
data_temp <- read.csv("https://www.data.gouv.fr/fr/datasets/r/dd0df06a-85f2-4621-8b8b-5a3fe195bcd7", sep = ";", header = TRUE)
# Poverty rate, fiscal income, socio-professional categories and obesity tables
# (local Excel files, all read from their "DEP" = per-department sheet)
data_pauvre <- read_excel("C:/Users/alave/Downloads/taux de pauvrete.xlsx", sheet = "DEP")
data_fisc <- read_excel("C:/Users/alave/Downloads/Revenus Fiscaux.xlsx", sheet = "DEP")
data_csp <- read_excel("C:/Users/alave/Downloads/csp.xlsx", sheet = "DEP")
data_obese <- read_excel("C:/Users/alave/Downloads/Obésité.xlsx", sheet = "DEP")
# number of peoples estimations -- data manipulation for join
# Build a per-department population table from the INSEE estimates sheet:
# column 1 = department code, ...8 = total population, ...3 to ...7 = counts
# per age band (10-19, 20-39, 40-59, 60-74, 75+), as laid out by read_excel.
colnames(estim_pop_dep_sexe_gca_1975_2021)[1] <- 'dep'
data_population <- subset(estim_pop_dep_sexe_gca_1975_2021,
select = c(dep, ...8, ...3, ...4, ...5, ...6, ...7))
data_population <- data.frame(data_population)
# Drop the Excel header rows that precede the actual data
data_population <- data_population[c(5:nrow(data_population)), ]
colnames(data_population)[2] <- 'nbr_pop'
# Convert all count columns to numeric in one pass. The original called
# type.convert() once per column; type.convert() without as.is = TRUE warns
# on modern R (>= 4.0) and could return factors, whereas the intent here is
# plainly "parse as numbers".
count_cols <- c("nbr_pop", "...3", "...4", "...5", "...6", "...7")
data_population[count_cols] <- lapply(data_population[count_cols],
                                      function(col) as.numeric(as.character(col)))
# Express each age band as a percentage of the department's population
age_cols <- c("...3", "...4", "...5", "...6", "...7")
pct_cols <- c("pourcentage_10_19ans", "pourcentage_20_39ans",
              "pourcentage_40_59ans", "pourcentage_60_74ans",
              "pourcentage_75_etplus")
data_population[pct_cols] <- lapply(data_population[age_cols],
                                    function(col) (col / data_population$nbr_pop) * 100)
# Raw age-band counts are no longer needed once the percentages exist
data_population <- subset(data_population, select = -c(...3, ...4, ...5, ...6, ...7))
# temp data manipulation
# Keep only the observation date, department code and mean daily temperature
data_temp <- subset(data_temp, select = c(date_obs, code_insee_departement, tmoy))
# pauvre data manipulation
# Poverty-rate sheet: column 1 is the department; columns ...3 to ...9 hold
# the overall poverty rate plus rates by age band. Rows 1-3 are Excel headers.
colnames(data_pauvre)[1] <- 'dep'
data_pauvre <- data_pauvre[c(4:nrow(data_pauvre)), ]
data_pauvre <- subset(data_pauvre, select = c(dep, ...3, ...4, ...5, ...6, ...7, ...8, ...9))
# Parse all seven rate columns to numeric in one pass (replaces seven
# type.convert() calls; type.convert without as.is = TRUE warns on R >= 4.0)
rate_cols <- paste0("...", 3:9)
data_pauvre[rate_cols] <- lapply(data_pauvre[rate_cols],
                                 function(col) as.numeric(as.character(col)))
# Give the rate columns descriptive names (replaces seven colnames()[i] lines)
colnames(data_pauvre)[2:8] <- c('taux_pauvrete_moy',
                                'taux_pauvrete_30etmoins',
                                'taux_pauvrete_30_39ans',
                                'taux_pauvrete_40_49ans',
                                'taux_pauvrete_50_59ans',
                                'taux_pauvrete_60_74ans',
                                'taux_pauvrete_75etplus')
# fisc data manipulation
# Fiscal-income sheet: keep the department, the median income (col 5) and the
# percentage of taxed households (col 4); row 1 is an Excel header.
colnames(data_fisc)[1] <- 'dep'
colnames(data_fisc)[5] <- 'med_revenu'
colnames(data_fisc)[4] <- 'pourcentage_menage_imposes'
data_fisc <- data_fisc[c(2:nrow(data_fisc)), ]
data_fisc <- subset(data_fisc, select = c("dep", "med_revenu", "pourcentage_menage_imposes"))
# One numeric-parsing pass instead of one type.convert() call per column
# (type.convert without as.is = TRUE warns on R >= 4.0)
fisc_cols <- c("med_revenu", "pourcentage_menage_imposes")
data_fisc[fisc_cols] <- lapply(data_fisc[fisc_cols],
                               function(col) as.numeric(as.character(col)))
# csp data manipulation
# Socio-professional categories: drop the two Excel header rows and the
# layout-only column ...2, then name the eight remaining columns.
data_csp <- data_csp[c(3:nrow(data_csp)), ]
data_csp <- subset(data_csp, select = -c(...2))
csp_names <- c('dep', 'agriculteurs', 'artisans', 'cadres',
               'prof_ind', 'employes', 'ouvriers', 'autres')
colnames(data_csp)[1:8] <- csp_names
# First remaining row is an in-sheet label row, not data
data_csp <- data_csp[-1, ]
# Parse every category column (all but 'dep') to numeric in one pass
data_csp[csp_names[-1]] <- lapply(data_csp[csp_names[-1]],
                                  function(col) as.numeric(as.character(col)))
# obese data manipulation
# Obesity sheet: keep department code and obesity frequency only
data_obese <- data_obese[c(4:nrow(data_obese)), ]
colnames(data_obese)[1] <- 'dep'
colnames(data_obese)[3] <- 'frequence_obesite'
data_obese <- subset(data_obese, select = c("dep", "frequence_obesite"))
data_obese$frequence_obesite <- as.numeric(as.character(data_obese$frequence_obesite))
# join data_covid and data_population on number dep
# Keep only the "both sexes" rows (sexe == 0), attach the population table,
# then express hospitalisation and ICU counts per 100 000 inhabitants.
data_covid <- subset(data_covid, sexe == 0)
data_population <- left_join(data_covid, data_population, by = "dep")
pop_count <- as.integer(data_population$nbr_pop)
data_population$hosp_pop <- data_population$hosp / pop_count * 100000
data_population$rea_pop <- data_population$rea / pop_count * 100000
################################################################################################ CREATE DATAFRAME FOR REGRESSION
# INFO: PIC COVID DAY -- HOSP & REA, NB POPULATION, TEMP APRIL, TAUX PAUVRETE, REVENU_MED
# Target day for the regression: 2020-04-15 (first-wave peak). Keep the
# per-100k rates plus the population-structure columns for that day.
data_for_regression <- subset(data_population,
select = c(dep, jour, hosp_pop, rea_pop, nbr_pop, pourcentage_10_19ans, pourcentage_20_39ans,
pourcentage_40_59ans, pourcentage_60_74ans, pourcentage_75_etplus))
# SELECT THE DAY
data_for_regression <- subset(data_for_regression, data_for_regression$jour == "2020-04-15")
# Lagged hospitalisation rates at four earlier dates, to be used as predictors
data_hosp <- subset(data_population, select = c(dep, jour, hosp_pop))
data_hosp$jour <- as.Date(data_hosp$jour)
data_hosp <- subset(data_hosp, data_hosp$jour == as.Date("2020-04-08") | data_hosp$jour == as.Date("2020-04-05") |
data_hosp$jour == as.Date("2020-04-01") | data_hosp$jour == as.Date("2020-03-27"))
# Long -> wide: one row per department, one hosp_pop.<date> column per lag
data_hosp<- reshape(data_hosp, direction = "wide", idvar = "dep", timevar = "jour")
# Same lag construction for the ICU (réanimation) rates
data_rea <- subset(data_population, select = c(dep, jour, rea_pop))
data_rea$jour <- as.Date(data_rea$jour)
data_rea <- subset(data_rea, data_rea$jour == as.Date("2020-04-08") | data_rea$jour == as.Date("2020-04-05") |
data_rea$jour == as.Date("2020-04-01") | data_rea$jour == as.Date("2020-03-27"))
data_rea<- reshape(data_rea, direction = "wide", idvar = "dep", timevar = "jour")
# SELECT THE TEMPERATURE OF THE MONTH BEFORE AND THE MONTH OF THE SELECTED DAY
# Build (dep, jour, tmoy) and parse the temperature column to numeric
data_temp_for_regression <- data.frame(data_temp$code_insee_departement, data_temp$date_obs,data_temp$tmoy)
data_temp_for_regression$data_temp.tmoy <- type.convert(data_temp_for_regression$data_temp.tmoy, dec = ".")
colnames(data_temp_for_regression)[1] <- 'dep'
colnames(data_temp_for_regression)[2] <- 'jour'
colnames(data_temp_for_regression)[3] <- 'tmoy'
data_temp_for_regression$jour <- as.Date(data_temp_for_regression$jour)
# Drop days with no temperature reading
data_temp_for_regression <- data_temp_for_regression[!is.na(data_temp_for_regression$tmoy),]
# NOTE(review): format(..., '%B-%Y') yields locale-dependent month names;
# the "mars-2020"/"avril-2020" filter below only matches under a French
# LC_TIME locale — confirm the locale of the machine that knits this.
data_temp_for_regression["month"] <- format(data_temp_for_regression$jour, '%B-%Y')
data_temp_for_regression <- data_temp_for_regression[, c("dep", "tmoy", "month")]
data_temp_for_regression <- subset(data_temp_for_regression,
data_temp_for_regression$month == "mars-2020" | data_temp_for_regression$month == "avril-2020")
# PASS THESE MONTH BY COLUMNS
# Average tmoy per (dep, month), then pivot months into tmoy.<month> columns
data_temp_for_regression<- reshape(aggregate(.~dep+month,data_temp_for_regression, mean),
direction = "wide", idvar = "dep", timevar = "month")
# JOIN WITH OTHER INFORMATION
# Attach every predictor table to the peak-day frame, keyed on the department
data_for_regression <- left_join(data_for_regression, data_pauvre, by = "dep")
data_for_regression <- left_join(data_for_regression, data_fisc, by = "dep")
data_for_regression <- left_join(data_for_regression, data_obese, by = "dep")
data_for_regression <- left_join(data_for_regression, data_csp, by = "dep")
data_for_regression <- left_join(data_for_regression, data_temp_for_regression, by = "dep")
data_for_regression <- left_join(data_for_regression, data_hosp, by = "dep")
data_for_regression <- left_join(data_for_regression, data_rea, by = "dep")
# Drop the identifier columns (dep, jour): only numeric predictors remain.
# (The original comment said "Retrieve dep & jour columns" — it removes them.)
data_for_regression <- subset(data_for_regression, select = -c(dep, jour))
# Keep only departments with an April-2020 temperature record
data_for_regression <- data_for_regression[!is.na(data_for_regression$`tmoy.avril-2020`), ]
# Coerce every column to numeric. The original wrapped this in
# type.convert(as.numeric(x)), which is redundant: as.numeric() already
# returns a numeric vector.
data_for_regression[] <- lapply(data_for_regression, as.numeric)
# Center and scale all columns so regression coefficients are comparable
data_for_regression <- as.data.frame(scale(data_for_regression))
##############################################################################################################################################
########################################################## REGRESSION LINEAIRE ###########################################################
##############################################################################################################################################
library(ggplot2)
library(plotly)
library(scales)
# Interactive line chart: hospitalisations per 100 000 inhabitants over time,
# one line per department, with hover tooltips via plotly.
g_hosp <- ggplot(data_population,
                 aes(x = jour, y = hosp_pop, group = dep, color = dep)) +
  geom_line()
ggplotly(g_hosp + labs(title = "Nombre de hospitalisations pour 100 000 habitants"),
         tooltip = c("jour", "hosp_pop", "dep"))
# Reproduce hosp plot and rea plot
# Same chart for the ICU (réanimation) rates
g <- ggplot(data_population,
            aes(x = jour, y = rea_pop, group = dep, color = dep)) +
  geom_line()
ggplotly(g + labs(title = "Nombre de réanimations pour 100 000 habitants"),
         tooltip = c("jour", "rea_pop", "dep"))
L’objectif est de trouver s’il y a des liens entre ces variables et le nombre d’hospitalisations/réanimations, dans le but de pouvoir par la suite prédire l’évolution de ces deux nombres.
Pour cela, nous avons d’abord étudié les corrélations de chaque variable avec le nombre d’hospitalisés et le nombre de réanimations.
library(corrplot)
# NOTE(review): corrplot is attached but not used in this visible chunk;
# the correlations below rely on base cor() only.
# Pairwise correlations of hosp_pop and rea_pop (columns 1-2 of the scaled
# regression table) with one candidate predictor at a time. The working
# variable is always named data_agri whatever predictor it currently holds,
# and predictors are picked by positional index into data_for_regression —
# fragile if the join order above ever changes.
# -- mean poverty rate (column 9)
data_agri <- data_for_regression[, c(1,2,9)]
corelation_pauvrete=cor(data_agri, use ="complete.obs")
corelation_pauvrete
## hosp_pop rea_pop taux_pauvrete_moy
## hosp_pop 1.00000000 0.90642397 0.07012067
## rea_pop 0.90642397 1.00000000 0.06305053
## taux_pauvrete_moy 0.07012067 0.06305053 1.00000000
# -- median income (column 16)
data_agri <- data_for_regression[, c(1,2,16)]
corelation_revenu=cor(data_agri, use ="complete.obs")
corelation_revenu
## hosp_pop rea_pop med_revenu
## hosp_pop 1.0000000 0.9064240 0.4432539
## rea_pop 0.9064240 1.0000000 0.5354672
## med_revenu 0.4432539 0.5354672 1.0000000
# -- percentage of taxed households (column 17)
data_agri <- data_for_regression[, c(1,2,17)]
corelation_impot=cor(data_agri, use ="complete.obs")
corelation_impot
## hosp_pop rea_pop pourcentage_menage_imposes
## hosp_pop 1.0000000 0.9064240 0.5469651
## rea_pop 0.9064240 1.0000000 0.6080154
## pourcentage_menage_imposes 0.5469651 0.6080154 1.0000000
# -- obesity frequency (column 18)
data_agri <- data_for_regression[, c(1,2,18)]
corelation_obese=cor(data_agri, use ="complete.obs")
corelation_obese
## hosp_pop rea_pop frequence_obesite
## hosp_pop 1.00000000 0.90642397 0.04067622
## rea_pop 0.90642397 1.00000000 -0.05313625
## frequence_obesite 0.04067622 -0.05313625 1.00000000
# -- farmers (column 19)
data_agri <- data_for_regression[, c(1,2,19)]
corelation_agri=cor(data_agri, use ="complete.obs")
corelation_agri
## hosp_pop rea_pop agriculteurs
## hosp_pop 1.0000000 0.9064240 -0.4373294
## rea_pop 0.9064240 1.0000000 -0.4678343
## agriculteurs -0.4373294 -0.4678343 1.0000000
# -- craftsmen (column 20)
data_agri <- data_for_regression[, c(1,2,20)]
corelation_artisan=cor(data_agri, use ="complete.obs")
corelation_artisan
## hosp_pop rea_pop artisans
## hosp_pop 1.0000000 0.9064240 -0.5527561
## rea_pop 0.9064240 1.0000000 -0.5316584
## artisans -0.5527561 -0.5316584 1.0000000
# -- executives (column 21)
data_agri <- data_for_regression[, c(1,2,21)]
corelation_cadre=cor(data_agri, use ="complete.obs")
corelation_cadre
## hosp_pop rea_pop cadres
## hosp_pop 1.0000000 0.9064240 0.5380131
## rea_pop 0.9064240 1.0000000 0.6706368
## cadres 0.5380131 0.6706368 1.0000000
# -- intermediate professions (column 22)
data_agri <- data_for_regression[, c(1,2,22)]
corelation_profint=cor(data_agri, use ="complete.obs")
corelation_profint
## hosp_pop rea_pop prof_ind
## hosp_pop 1.0000000 0.9064240 0.1494916
## rea_pop 0.9064240 1.0000000 0.2183703
## prof_ind 0.1494916 0.2183703 1.0000000
# -- employees (column 23)
data_agri <- data_for_regression[, c(1,2,23)]
corelation_employe=cor(data_agri, use ="complete.obs")
corelation_employe
## hosp_pop rea_pop employes
## hosp_pop 1.0000000 0.9064240 -0.3260549
## rea_pop 0.9064240 1.0000000 -0.4134684
## employes -0.3260549 -0.4134684 1.0000000
# -- manual workers (column 24)
data_agri <- data_for_regression[, c(1,2,24)]
corelation_ouvrier=cor(data_agri, use ="complete.obs")
corelation_ouvrier
## hosp_pop rea_pop ouvriers
## hosp_pop 1.0000000 0.9064240 -0.2531782
## rea_pop 0.9064240 1.0000000 -0.3956963
## ouvriers -0.2531782 -0.3956963 1.0000000
# -- other categories (column 25)
data_agri <- data_for_regression[, c(1,2,25)]
corelation_autre=cor(data_agri, use ="complete.obs")
corelation_autre
## hosp_pop rea_pop autres
## hosp_pop 1.0000000 0.9064240 0.3292375
## rea_pop 0.9064240 1.0000000 0.3253975
## autres 0.3292375 0.3253975 1.0000000
# -- mean April 2020 temperature (column 26)
data_agri <- data_for_regression[, c(1,2,26)]
corelation_tempAvril=cor(data_agri, use ="complete.obs")
corelation_tempAvril
## hosp_pop rea_pop tmoy.avril-2020
## hosp_pop 1.00000000 0.90642397 0.01714096
## rea_pop 0.90642397 1.00000000 0.07733877
## tmoy.avril-2020 0.01714096 0.07733877 1.00000000
# -- mean March 2020 temperature (column 27)
data_agri <- data_for_regression[, c(1,2,27)]
corelation_tempMars=cor(data_agri, use ="complete.obs")
corelation_tempMars
## hosp_pop rea_pop tmoy.mars-2020
## hosp_pop 1.0000000 0.9064240 -0.3307346
## rea_pop 0.9064240 1.0000000 -0.2759859
## tmoy.mars-2020 -0.3307346 -0.2759859 1.0000000
Nous pouvons remarquer que les corrélations entre les variables nombres à l’hôpital et en réanimation avec les variables taux de pauvreté moyen, taux d’obésité, températures moyennes au mois d’avril ne sont pas bonnes (corrélations très faibles).
Nous trouvons des variables qui semblent plutôt être corrélées avec le nombre de personnes hospitalisées et en réanimation :
Les personnes ayant une catégorie socio professionnelle de profession intermédiaire
Les personnes ayant une catégorie socio professionnelle d’employés et d’ouvriers, la corrélation est d’ailleurs négative
La variable sur les températures moyennes de mars semble aussi être corrélée négativement
Les variables qui semblent être correctement corrélées avec le nombre de personnes hospitalisées et en réanimation sont:
Le revenu médian, le pourcentage des ménages imposés et les personnes appartenant à la catégorie socio professionnelles ‘Cadres et profession intellectuelle supérieure’
Certaines variables sont corrélées négativement : les personnes appartenant à la catégorie socio professionnelles Agriculteurs, Artisans
Afin de confirmer (ou non) les corrélations trouvées, nous allons maintenant faire une régression linéaire du nombre d’hospitalisés au pic de cas du premier confinement, à savoir le 15/04/2020, sur l’ensemble des variables étudiées. Notre but est de savoir quelles variables ont un impact sur le taux d’hospitalisation.
# Linear regression of hosp_pop on every other predictor; rea_pop is removed
# first so the same-day ICU rate does not trivially explain hospitalisations.
data_for_regression_lin_hosp <- subset(data_for_regression, select = -c(rea_pop))
# NOTE(review): `df$hosp_pop ~ .` mixes $-extraction with a data= argument;
# the idiomatic and safer form is lm(hosp_pop ~ ., data = df), which
# guarantees the response column is excluded from the `.` expansion.
modele_all <- lm(data_for_regression_lin_hosp$hosp_pop ~., data = data_for_regression_lin_hosp)
sm <- summary(modele_all)
sm
##
## Call:
## lm(formula = data_for_regression_lin_hosp$hosp_pop ~ ., data = data_for_regression_lin_hosp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.51847 -0.07714 -0.00216 0.08120 0.52537
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.010e-16 1.594e-02 0.000 1.000000
## nbr_pop 2.884e-02 3.963e-02 0.728 0.469576
## pourcentage_10_19ans -1.137e-01 1.036e-01 -1.098 0.276430
## pourcentage_20_39ans -5.787e-02 1.176e-01 -0.492 0.624284
## pourcentage_40_59ans -7.737e-02 3.284e-02 -2.356 0.021596 *
## pourcentage_60_74ans -1.088e-01 1.663e-01 -0.654 0.515528
## pourcentage_75_etplus NA NA NA NA
## taux_pauvrete_moy 4.121e-01 7.411e-01 0.556 0.580102
## taux_pauvrete_30etmoins -5.244e-02 1.206e-01 -0.435 0.665213
## taux_pauvrete_30_39ans 1.594e-01 1.899e-01 0.839 0.404425
## taux_pauvrete_40_49ans -4.715e-01 3.104e-01 -1.519 0.133724
## taux_pauvrete_50_59ans -1.157e-01 1.756e-01 -0.659 0.512505
## taux_pauvrete_60_74ans 1.709e-01 1.981e-01 0.863 0.391547
## taux_pauvrete_75etplus -1.148e-01 1.004e-01 -1.143 0.257441
## med_revenu 9.019e-02 9.816e-02 0.919 0.361708
## pourcentage_menage_imposes -3.809e-02 8.229e-02 -0.463 0.644994
## frequence_obesite -6.116e-03 2.790e-02 -0.219 0.827215
## agriculteurs 7.408e-01 5.210e-01 1.422 0.160021
## artisans 6.049e-01 4.459e-01 1.357 0.179734
## cadres 2.339e+00 1.706e+00 1.371 0.175246
## prof_ind 7.943e-01 5.406e-01 1.469 0.146776
## employes 1.007e+00 6.966e-01 1.445 0.153423
## ouvriers 1.988e+00 1.387e+00 1.434 0.156540
## autres 1.719e-01 1.528e-01 1.125 0.264702
## `tmoy.avril-2020` 2.766e-02 3.783e-02 0.731 0.467437
## `tmoy.mars-2020` -2.694e-02 5.048e-02 -0.534 0.595382
## `hosp_pop.2020-03-27` -1.233e-01 1.108e-01 -1.113 0.270014
## `hosp_pop.2020-04-01` -3.887e-01 2.343e-01 -1.659 0.102116
## `hosp_pop.2020-04-05` 6.568e-01 3.302e-01 1.989 0.051019 .
## `hosp_pop.2020-04-08` 9.355e-01 2.353e-01 3.976 0.000183 ***
## `rea_pop.2020-03-27` 1.169e-01 8.643e-02 1.352 0.181238
## `rea_pop.2020-04-01` 2.779e-02 1.691e-01 0.164 0.870007
## `rea_pop.2020-04-05` -4.620e-01 2.192e-01 -2.107 0.039079 *
## `rea_pop.2020-04-08` 1.835e-01 1.751e-01 1.048 0.298624
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1562 on 63 degrees of freedom
## Multiple R-squared: 0.9838, Adjusted R-squared: 0.9756
## F-statistic: 119.8 on 32 and 63 DF, p-value: < 2.2e-16
# 70/30 train/test split on row indices.
# NOTE(review): there is no set.seed() before sample(), so the split — and
# every downstream train/test metric — changes on each knit.
index = sample(1:nrow(data_for_regression), 0.7*nrow(data_for_regression))
train = data_for_regression[index,] # Create the training data
test = data_for_regression[-index,] # Create the test data
# Compute evaluation metrics for a fitted linear model.
#
# model:       an lm fit (only its adjusted R-squared is read)
# df:          data frame holding the observed response values
# predictions: predicted values aligned with the rows of df
# target:      name of the observed-response column in df
#
# Returns a one-row data.frame with RMSE (of predictions vs df[, target])
# and Rsquare (the model's adjusted R-squared).
eval_metrics <- function(model, df, predictions, target) {
  resids <- df[, target] - predictions
  # sqrt(mean(r^2)) == sqrt(sum(r^2) / length(predictions)) from the original
  rmse <- sqrt(mean(resids^2))
  # summary() is computed once here; the original called it twice and also
  # left an unused r.squared and a dead bare `adj_r2` expression.
  adj_r2 <- summary(model)$adj.r.squared
  data.frame(RMSE = rmse, Rsquare = adj_r2)
}
# Fit the full linear model on the training split and score both splits.
# NOTE(review): `train` still contains rea_pop, so the hosp model uses the
# same-day ICU rate as a predictor (and vice versa below) — confirm intended.
modele_all_pred <- lm(hosp_pop ~., data = train)
predictions_hosp_train = predict(modele_all_pred, newdata = train)
results_lm_hosp_train <- eval_metrics(modele_all_pred, train, predictions_hosp_train, target = 'hosp_pop')
predictions_hosp_test = predict(modele_all_pred, newdata = test)
results_lm_hosp_test <- eval_metrics(modele_all_pred, test, predictions_hosp_test, target = 'hosp_pop')
# Same procedure with the ICU rate (rea_pop) as the response
modele_all_pred_rea <- lm(rea_pop ~., data = train)
predictions_rea_train = predict(modele_all_pred_rea, newdata = train)
results_lm_rea_train <- eval_metrics(modele_all_pred_rea, train, predictions_rea_train, target = 'rea_pop')
predictions_rea_test = predict(modele_all_pred_rea, newdata = test)
results_lm_rea_test <- eval_metrics(modele_all_pred_rea, test, predictions_rea_test, target = 'rea_pop')
Nous remarquons tout d’abord que le modèle est bon. En effet, la valeur du R-squared nous indique que les variables choisies expliquent bien le nombre d’hospitalisations du 15/04/2020.
De plus, la variable nombre en réanimation et la variable nombre d’hospitalisations du 08/04/2020 ont des p-valeurs très inférieures à 0.05, on peut donc facilement en déduire qu’elles ont un impact important sur notre nombre d’hospitalisations du pic (ce qui est cohérent).
Nous ajoutons à cela des valeurs plutôt acceptables pour les variables : pourcentage des 40 à 59 ans, taux de pauvreté des 60 à 74 ans, les températures moyennes du mois précédent et du mois en cours.
Nous utilisons un stepAIC qui va tester toutes les combinaisons de variables afin de sélectionner le meilleur modèle. On testera les résultats ensuite.
library(MASS)
# NOTE(review): attaching MASS after dplyr masks dplyr::select(); the code
# here uses subset() so it is harmless, but beware if select() is added later.
# Stepwise AIC variable selection (both directions) on the full hosp model
step <- stepAIC(modele_all, direction = "both", trace = FALSE)
step
##
## Call:
## lm(formula = data_for_regression_lin_hosp$hosp_pop ~ pourcentage_40_59ans +
## taux_pauvrete_30_39ans + taux_pauvrete_40_49ans + taux_pauvrete_60_74ans +
## taux_pauvrete_75etplus + `hosp_pop.2020-04-01` + `hosp_pop.2020-04-05` +
## `hosp_pop.2020-04-08` + `rea_pop.2020-03-27` + `rea_pop.2020-04-05` +
## taux_pauvrete_50_59ans, data = data_for_regression_lin_hosp)
##
## Coefficients:
## (Intercept) pourcentage_40_59ans taux_pauvrete_30_39ans
## 1.534e-16 -5.575e-02 1.591e-01
## taux_pauvrete_40_49ans taux_pauvrete_60_74ans taux_pauvrete_75etplus
## -1.962e-01 1.902e-01 -5.111e-02
## `hosp_pop.2020-04-01` `hosp_pop.2020-04-05` `hosp_pop.2020-04-08`
## -4.859e-01 5.484e-01 1.019e+00
## `rea_pop.2020-03-27` `rea_pop.2020-04-05` taux_pauvrete_50_59ans
## 8.154e-02 -2.170e-01 -1.278e-01
#modele_after_step <- lm(formula = data_for_regression$hosp_pop ~ rea_pop + nbr_pop +
# pourcentage_10_19ans + pourcentage_40_59ans + taux_pauvrete_moy +
# taux_pauvrete_30etmoins + taux_pauvrete_30_39ans + taux_pauvrete_50_59ans +
# taux_pauvrete_60_74ans + taux_pauvrete_75etplus + frequence_obesite +
# artisans + prof_ind + employes + ouvriers + `tmoy.avril-2020` +
# `tmoy.mars-2020` + `hosp_pop.2020-04-05` + `hosp_pop.2020-04-07` +
# `hosp_pop.2020-04-09` + `hosp_pop.2020-04-10` + `hosp_pop.2020-04-14` +
# `rea_pop.2020-04-04` + `rea_pop.2020-04-05` + `rea_pop.2020-04-12` +
# `rea_pop.2020-04-13` + `hosp_pop.2020-04-06`, data = data_for_regression)
#sm_step <- summary(modele_after_step)
#sm_step
On remarque donc effectivement que ce modèle fonctionne très bien avec un R-squared très proche voire supérieur au modele contenant toutes les variables.
On peut donc en déduire une sélection de nos variables pour l’explication du nombre d’hospitalisations avec la regression linéaire.
Nous décidons d’effectuer également une sélection de variables pour expliquer le nombre de réanimations.
# Same exercise for ICU admissions: regress rea_pop on all other predictors,
# with hosp_pop removed so it does not trivially explain the ICU rate.
data_for_regression_lin_rea <- subset(data_for_regression, select = -c(hosp_pop))
# NOTE(review): as for the hosp model, lm(rea_pop ~ ., data = df) would be
# the idiomatic form instead of $-extracting the response.
modele_rea <- lm(data_for_regression_lin_rea$rea_pop ~., data = data_for_regression_lin_rea)
sm_rea <- summary(modele_rea)
sm_rea
##
## Call:
## lm(formula = data_for_regression_lin_rea$rea_pop ~ ., data = data_for_regression_lin_rea)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.39429 -0.08339 -0.01338 0.07862 0.26338
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.072e-16 1.555e-02 0.000 1.00000
## nbr_pop 4.698e-02 3.868e-02 1.215 0.22906
## pourcentage_10_19ans 6.358e-02 1.011e-01 0.629 0.53173
## pourcentage_20_39ans 2.186e-01 1.147e-01 1.905 0.06128 .
## pourcentage_40_59ans -3.712e-03 3.205e-02 -0.116 0.90817
## pourcentage_60_74ans 2.694e-01 1.623e-01 1.660 0.10196
## pourcentage_75_etplus NA NA NA NA
## taux_pauvrete_moy -1.054e-01 7.232e-01 -0.146 0.88460
## taux_pauvrete_30etmoins -4.469e-02 1.177e-01 -0.380 0.70544
## taux_pauvrete_30_39ans 4.199e-02 1.853e-01 0.227 0.82152
## taux_pauvrete_40_49ans 3.490e-01 3.029e-01 1.152 0.25351
## taux_pauvrete_50_59ans -1.533e-01 1.714e-01 -0.894 0.37455
## taux_pauvrete_60_74ans -3.016e-02 1.933e-01 -0.156 0.87652
## taux_pauvrete_75etplus 7.949e-02 9.802e-02 0.811 0.42047
## med_revenu 1.228e-01 9.580e-02 1.281 0.20473
## pourcentage_menage_imposes 1.357e-02 8.030e-02 0.169 0.86634
## frequence_obesite -7.519e-03 2.723e-02 -0.276 0.78335
## agriculteurs -2.979e-01 5.085e-01 -0.586 0.56007
## artisans -3.500e-01 4.352e-01 -0.804 0.42431
## cadres -1.154e+00 1.665e+00 -0.693 0.49103
## prof_ind -3.267e-01 5.276e-01 -0.619 0.53803
## employes -4.571e-01 6.798e-01 -0.672 0.50385
## ouvriers -8.794e-01 1.353e+00 -0.650 0.51811
## autres -1.136e-01 1.491e-01 -0.762 0.44896
## `tmoy.avril-2020` 3.074e-02 3.692e-02 0.833 0.40823
## `tmoy.mars-2020` 7.844e-03 4.926e-02 0.159 0.87399
## `hosp_pop.2020-03-27` 6.542e-02 1.081e-01 0.605 0.54723
## `hosp_pop.2020-04-01` -4.243e-01 2.287e-01 -1.856 0.06820 .
## `hosp_pop.2020-04-05` -3.197e-01 3.222e-01 -0.992 0.32496
## `hosp_pop.2020-04-08` 7.768e-01 2.296e-01 3.383 0.00124 **
## `rea_pop.2020-03-27` 2.914e-02 8.435e-02 0.345 0.73087
## `rea_pop.2020-04-01` -2.789e-02 1.650e-01 -0.169 0.86635
## `rea_pop.2020-04-05` 3.096e-02 2.140e-01 0.145 0.88541
## `rea_pop.2020-04-08` 7.736e-01 1.709e-01 4.527 2.71e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1524 on 63 degrees of freedom
## Multiple R-squared: 0.9846, Adjusted R-squared: 0.9768
## F-statistic: 125.9 on 32 and 63 DF, p-value: < 2.2e-16
# Stepwise AIC selection (both directions) for the ICU model
step_rea <- stepAIC(modele_rea, direction = "both", trace = FALSE)
step_rea
##
## Call:
## lm(formula = data_for_regression_lin_rea$rea_pop ~ nbr_pop +
## pourcentage_10_19ans + pourcentage_20_39ans + pourcentage_60_74ans +
## taux_pauvrete_40_49ans + taux_pauvrete_50_59ans + taux_pauvrete_75etplus +
## med_revenu + artisans + autres + `tmoy.avril-2020` + `hosp_pop.2020-03-27` +
## `hosp_pop.2020-04-01` + `hosp_pop.2020-04-08` + `rea_pop.2020-04-08`,
## data = data_for_regression_lin_rea)
##
## Coefficients:
## (Intercept) nbr_pop pourcentage_10_19ans
## -1.421e-16 4.785e-02 1.043e-01
## pourcentage_20_39ans pourcentage_60_74ans taux_pauvrete_40_49ans
## 1.796e-01 2.944e-01 2.956e-01
## taux_pauvrete_50_59ans taux_pauvrete_75etplus med_revenu
## -2.123e-01 7.494e-02 8.683e-02
## artisans autres `tmoy.avril-2020`
## -7.253e-02 -8.093e-02 2.825e-02
## `hosp_pop.2020-03-27` `hosp_pop.2020-04-01` `hosp_pop.2020-04-08`
## 1.297e-01 -6.440e-01 6.249e-01
## `rea_pop.2020-04-08`
## 8.120e-01
#modele_after_step_rea <- lm(formula = data_for_regression$rea_pop ~ hosp_pop + pourcentage_10_19ans +
# taux_pauvrete_moy + taux_pauvrete_30etmoins + taux_pauvrete_60_74ans +
# employes + ouvriers + `hosp_pop.2020-04-03` + `hosp_pop.2020-04-07` +
# `hosp_pop.2020-04-08` + `hosp_pop.2020-04-12` + `hosp_pop.2020-04-14` +
# `rea_pop.2020-04-02` + `rea_pop.2020-04-03` + `rea_pop.2020-04-06` +
# `rea_pop.2020-04-10` + `rea_pop.2020-04-14` + `hosp_pop.2020-04-13` +
# autres, data = data_for_regression)
#sm_step_rea <- summary(modele_after_step_rea)
#sm_step_rea
On remarque donc effectivement que ce modèle aussi fonctionne très bien avec un R-squared très proche voire supérieur au modele contenant toutes les variables.
Nous passons maintenant à la régression ridge.
On commence d’abord par observer les coefficients de nos variables en fonction des log lambdas:
library(glmnet)
# Design matrix for the hospitalization model: every predictor except the two
# response columns (hosp_pop and rea_pop).
x_hosp = as.matrix(subset(data_for_regression, select = -c(hosp_pop, rea_pop)))
y_train_hosp = data_for_regression$hosp_pop
# Grid of candidate penalties: 10^2 down to 10^-3 on a log10 scale.
lambdas <- 10^seq(2, -3, by = -.1)
# Ridge path (alpha = 0) over the whole grid, for the coefficient-path plot.
# NOTE(review): standardize = FALSE assumes predictors were already scaled
# upstream — confirm against the data-preparation chunk.
ridge2 <- glmnet(x_hosp, y_train_hosp, alpha = 0, lambda = lambdas, family = 'gaussian', standardize = FALSE)
plot(ridge2, xvar = "lambda")
On sélectionne maintenant le lambda optimal :
# Select the optimal penalty by cross-validation (cv.glmnet default folds).
# NOTE(review): no set.seed() — the CV fold assignment, and hence lambda.min,
# varies between knits.
cv_ridge_hosp <- cv.glmnet(x_hosp, y_train_hosp, alpha = 0, lambda = lambdas)
optimal_lambda_hosp <- cv_ridge_hosp$lambda.min
optimal_lambda_hosp
## [1] 0.007943282
On observe les coefficients de chaque variable avec ce lambda.
Plus un coefficient est éloigné de 0, plus il a un impact sur le modèle.
# Refit the ridge model at the CV-selected penalty to inspect coefficients.
ridge_reg_hosp = glmnet(x_hosp, y_train_hosp, alpha = 0, family = 'gaussian', lambda = optimal_lambda_hosp)
ridge_reg_hosp$beta
## 33 x 1 sparse Matrix of class "dgCMatrix"
## s0
## nbr_pop 0.012756956
## pourcentage_10_19ans -0.038719463
## pourcentage_20_39ans 0.035909003
## pourcentage_40_59ans -0.030442408
## pourcentage_60_74ans -0.049850594
## pourcentage_75_etplus 0.063325877
## taux_pauvrete_moy 0.025609783
## taux_pauvrete_30etmoins -0.016625622
## taux_pauvrete_30_39ans 0.093467384
## taux_pauvrete_40_49ans -0.098861617
## taux_pauvrete_50_59ans -0.069442895
## taux_pauvrete_60_74ans 0.154942252
## taux_pauvrete_75etplus -0.087352704
## med_revenu 0.029293435
## pourcentage_menage_imposes 0.041238342
## frequence_obesite -0.005760535
## agriculteurs 0.034577334
## artisans -0.053288045
## cadres -0.021584064
## prof_ind 0.010513091
## employes 0.039643191
## ouvriers 0.028003040
## autres -0.003345314
## tmoy.avril-2020 0.013920438
## tmoy.mars-2020 -0.018610248
## hosp_pop.2020-03-27 -0.096705695
## hosp_pop.2020-04-01 -0.090628370
## hosp_pop.2020-04-05 0.434712536
## hosp_pop.2020-04-08 0.759943069
## rea_pop.2020-03-27 0.121769703
## rea_pop.2020-04-01 -0.142877316
## rea_pop.2020-04-05 -0.158007918
## rea_pop.2020-04-08 0.068335639
# 70/30 train/test split of the departments (row-wise).
# NOTE(review): no set.seed() before sample(), so this split — and every
# downstream RMSE/R-squared in the report — changes on each knit; add a seed
# for reproducibility.
index = sample(1:nrow(data_for_regression), 0.7*nrow(data_for_regression))
train = data_for_regression[index,] # Create the training data
# Training design matrix / response for the hospitalization model.
x_train_hosp <- as.matrix(subset(train, select = -c(hosp_pop, rea_pop)))
y_train_pred_hosp <- train$hosp_pop
test = data_for_regression[-index,] # Create the test data
# Held-out design matrix / response.
x_test_hosp <- as.matrix(subset(test, select = -c(hosp_pop, rea_pop)))
y_test_hosp <- test$hosp_pop
# Compute fit metrics for a set of predictions.
#   true      : observed response values
#   predicted : model predictions for the same observations
#   df        : data used for prediction; nrow(df) gives the sample size
# Returns a one-row data.frame with columns RMSE and Rsquare.
eval_results <- function(true, predicted, df) {
  residual_ss <- sum((predicted - true)^2)
  total_ss <- sum((true - mean(true))^2)
  data.frame(
    RMSE = sqrt(residual_ss / nrow(df)),
    Rsquare = 1 - residual_ss / total_ss
  )
}
Prediction Train :
# Prediction and evaluation on train data (ridge, hospitalization model).
predictions_train_hosp <- predict(ridge_reg_hosp_pred, s = optimal_lambda_hosp, newx = x_train_hosp)
results_hosp_ridge_train <- eval_results(y_train_pred_hosp, predictions_train_hosp, train)
results_hosp_ridge_train
## RMSE Rsquare
## 1 0.1397156 0.979581
Prediction Test :
# Prediction and evaluation on held-out test data (ridge, hospitalization).
predictions_test_hosp <- predict(ridge_reg_hosp_pred, s = optimal_lambda_hosp, newx = x_test_hosp)
results_hosp_ridge_test <- eval_results(y_test_hosp, predictions_test_hosp, test)
results_hosp_ridge_test
## RMSE Rsquare
## 1 0.1769848 0.9706271
On remarque que les résultats de la prédiction sont bons. On arrive donc à prédire le pic d’hospitalisation du premier confinement à partir des variables que nous avons sélectionnées.
On commence d’abord par observer les coefficients de nos variables en fonction des log lambdas:
library(glmnet)
# Design matrix for the ICU (rea) model: same predictors, same exclusions.
x = as.matrix(subset(data_for_regression, select = -c(rea_pop, hosp_pop)))
y_train = data_for_regression$rea_pop
# Same penalty grid as for the hospitalization model.
lambdas <- 10^seq(2, -3, by = -.1)
# Ridge path for the coefficient-path plot.
ridge_rea <- glmnet(x, y_train, alpha = 0, lambda = lambdas, family = 'gaussian', standardize = FALSE)
plot(ridge_rea, xvar = "lambda")
On sélectionne maintenant le lambda optimal :
# Select the optimal penalty by cross-validation (ICU model).
# NOTE(review): no set.seed() — lambda.min varies between knits.
cv_ridge <- cv.glmnet(x, y_train, alpha = 0, lambda = lambdas)
optimal_lambda <- cv_ridge$lambda.min
optimal_lambda
## [1] 0.003981072
On obtient un lambda optimal d’environ 0.004 ; on peut le repérer sur le graphique à l’abscisse log(0.004) ≈ -5.5.
Les coefficients de chaque variable sont étudiés avec ce lambda.
Plus un coefficient est éloigné de 0, plus il a un impact sur le modèle.
# Refit the ICU ridge model at the CV-selected penalty to inspect coefficients.
ridge_reg = glmnet(x, y_train, alpha = 0, family = 'gaussian', lambda = optimal_lambda)
ridge_reg$beta
## 33 x 1 sparse Matrix of class "dgCMatrix"
## s0
## nbr_pop 0.044665977
## pourcentage_10_19ans -0.082805433
## pourcentage_20_39ans 0.049360367
## pourcentage_40_59ans -0.033532103
## pourcentage_60_74ans 0.096830084
## pourcentage_75_etplus -0.096828841
## taux_pauvrete_moy 0.050053841
## taux_pauvrete_30etmoins -0.094395884
## taux_pauvrete_30_39ans 0.103764835
## taux_pauvrete_40_49ans 0.155792933
## taux_pauvrete_50_59ans -0.083065482
## taux_pauvrete_60_74ans -0.073659896
## taux_pauvrete_75etplus 0.060940376
## med_revenu 0.089201514
## pourcentage_menage_imposes 0.036611771
## frequence_obesite -0.014486088
## agriculteurs 0.036920784
## artisans -0.075513557
## cadres -0.036853102
## prof_ind 0.025654803
## employes -0.007375269
## ouvriers 0.013740803
## autres -0.016162740
## tmoy.avril-2020 0.033019591
## tmoy.mars-2020 -0.004311665
## hosp_pop.2020-03-27 0.013605261
## hosp_pop.2020-04-01 -0.331985571
## hosp_pop.2020-04-05 -0.100253218
## hosp_pop.2020-04-08 0.527272856
## rea_pop.2020-03-27 0.054527891
## rea_pop.2020-04-01 -0.117554277
## rea_pop.2020-04-05 0.113088651
## rea_pop.2020-04-08 0.733210131
# Train/test matrices for the ICU model, reusing the split built earlier
# (same `index`, so hosp and rea models are evaluated on identical folds).
x_train_rea <- as.matrix(subset(train, select = -c(rea_pop, hosp_pop)))
y_train_pred_rea <- train$rea_pop
x_test_rea <- as.matrix(subset(test, select = -c(rea_pop, hosp_pop)))
y_test_rea <- test$rea_pop
# Ridge fit on the training fold at the previously selected penalty.
ridge_reg_rea_pred <- glmnet(x_train_rea, y_train_pred_rea, alpha = 0, family = 'gaussian', lambda = optimal_lambda)
# Compute fit metrics for a set of predictions.
#   true      : observed response values
#   predicted : model predictions for the same observations
#   df        : data used for prediction; nrow(df) gives the sample size
# Returns a one-row data.frame with columns RMSE and Rsquare.
eval_results <- function(true, predicted, df) {
  residual_ss <- sum((predicted - true)^2)
  total_ss <- sum((true - mean(true))^2)
  data.frame(
    RMSE = sqrt(residual_ss / nrow(df)),
    Rsquare = 1 - residual_ss / total_ss
  )
}
Prediction Train :
# Prediction and evaluation on train data (ridge, ICU model).
predictions_train_rea <- predict(ridge_reg_rea_pred, s = optimal_lambda, newx = x_train_rea)
results_rea_ridge_train <- eval_results(y_train_pred_rea, predictions_train_rea, train)
results_rea_ridge_train
## RMSE Rsquare
## 1 0.1250255 0.9842947
Prediction Test :
# Prediction and evaluation on held-out test data (ridge, ICU model).
predictions_test_rea <- predict(ridge_reg_rea_pred, s = optimal_lambda, newx = x_test_rea)
results_rea_ridge_test <- eval_results(y_test_rea, predictions_test_rea, test)
results_rea_ridge_test
## RMSE Rsquare
## 1 0.1784203 0.9673894
Nous affichons le lambda optimal pour cette regression et nous observons les coefficients associés aux variables:
# LASSO (alpha = 1) for the hospitalization model: cross-validate the penalty,
# then refit at lambda.min; LASSO zeroes out weak predictors (shown as '.').
cv_lasso_hosp <- cv.glmnet(x_hosp, y_train_hosp, alpha = 1, lambda = lambdas)
optimal_lambda_lasso_hosp <- cv_lasso_hosp$lambda.min
optimal_lambda_lasso_hosp
## [1] 0.006309573
lasso_reg <- glmnet(x_hosp, y_train_hosp, alpha = 1, standardize = FALSE, lambda = optimal_lambda_lasso_hosp)
lasso_reg$beta
## 33 x 1 sparse Matrix of class "dgCMatrix"
## s0
## nbr_pop .
## pourcentage_10_19ans .
## pourcentage_20_39ans .
## pourcentage_40_59ans -0.017593004
## pourcentage_60_74ans .
## pourcentage_75_etplus .
## taux_pauvrete_moy .
## taux_pauvrete_30etmoins .
## taux_pauvrete_30_39ans .
## taux_pauvrete_40_49ans .
## taux_pauvrete_50_59ans .
## taux_pauvrete_60_74ans 0.009469029
## taux_pauvrete_75etplus -0.009621545
## med_revenu .
## pourcentage_menage_imposes 0.018398826
## frequence_obesite 0.009477722
## agriculteurs .
## artisans -0.031270920
## cadres .
## prof_ind -0.003969117
## employes .
## ouvriers .
## autres .
## tmoy.avril-2020 .
## tmoy.mars-2020 .
## hosp_pop.2020-03-27 -0.034290737
## hosp_pop.2020-04-01 .
## hosp_pop.2020-04-05 .
## hosp_pop.2020-04-08 1.044647996
## rea_pop.2020-03-27 .
## rea_pop.2020-04-01 -0.071673787
## rea_pop.2020-04-05 .
## rea_pop.2020-04-08 .
# LASSO fit on the training fold only, for train/test evaluation below.
lasso_reg_hosp_pred <- glmnet(x_train_hosp, y_train_pred_hosp, alpha = 1, standardize = FALSE, lambda = optimal_lambda_lasso_hosp)
# Compute fit metrics for a set of predictions.
#   true      : observed response values
#   predicted : model predictions for the same observations
#   df        : data used for prediction; nrow(df) gives the sample size
# Returns a one-row data.frame with columns RMSE and Rsquare.
eval_results <- function(true, predicted, df) {
  residual_ss <- sum((predicted - true)^2)
  total_ss <- sum((true - mean(true))^2)
  data.frame(
    RMSE = sqrt(residual_ss / nrow(df)),
    Rsquare = 1 - residual_ss / total_ss
  )
}
Prediction Train :
# Prediction and evaluation on train data (LASSO, hospitalization model).
predictions_lasso_hosp <- predict(lasso_reg_hosp_pred, s = optimal_lambda_lasso_hosp, newx = x_train_hosp)
results_hosp_lasso_train <- eval_results(y_train_pred_hosp, predictions_lasso_hosp, train)
results_hosp_lasso_train
## RMSE Rsquare
## 1 0.1678551 0.9705277
Prediction Test :
# Prediction and evaluation on held-out test data (LASSO, hospitalization).
predictions_test_lasso_hosp <- predict(lasso_reg_hosp_pred, s = optimal_lambda_lasso_hosp, newx = x_test_hosp)
results_hosp_lasso_test <- eval_results(y_test_hosp, predictions_test_lasso_hosp, test)
results_hosp_lasso_test
## RMSE Rsquare
## 1 0.1554564 0.9773383
Nous affichons le lambda optimal pour cette regression et nous observons les coefficients associés aux variables:
# LASSO (alpha = 1) for the ICU model: cross-validate the penalty, then refit
# at lambda.min.
# Fix: use the ICU design matrix `x` (paired with y_train = rea_pop) instead
# of `x_hosp`. Both matrices contain the same columns here, so the printed
# results are unaffected, but `x` is the intended, consistent choice.
cv_lasso_rea <- cv.glmnet(x, y_train, alpha = 1, lambda = lambdas)
optimal_lambda_lasso_rea <- cv_lasso_rea$lambda.min
optimal_lambda_lasso_rea
## [1] 0.003162278
lasso_reg_rea <- glmnet(x, y_train, alpha = 1, standardize = FALSE, lambda = optimal_lambda_lasso_rea)
lasso_reg_rea$beta
## 33 x 1 sparse Matrix of class "dgCMatrix"
## s0
## nbr_pop 0.046934203
## pourcentage_10_19ans -0.043932059
## pourcentage_20_39ans 0.009809057
## pourcentage_40_59ans -0.002271553
## pourcentage_60_74ans .
## pourcentage_75_etplus -0.001630267
## taux_pauvrete_moy .
## taux_pauvrete_30etmoins -0.022755098
## taux_pauvrete_30_39ans 0.030739584
## taux_pauvrete_40_49ans 0.031492337
## taux_pauvrete_50_59ans .
## taux_pauvrete_60_74ans .
## taux_pauvrete_75etplus .
## med_revenu .
## pourcentage_menage_imposes 0.024655109
## frequence_obesite .
## agriculteurs 0.025178225
## artisans -0.041551843
## cadres .
## prof_ind .
## employes .
## ouvriers .
## autres .
## tmoy.avril-2020 0.027777759
## tmoy.mars-2020 -0.024754808
## hosp_pop.2020-03-27 .
## hosp_pop.2020-04-01 -0.317429240
## hosp_pop.2020-04-05 .
## hosp_pop.2020-04-08 0.435927878
## rea_pop.2020-03-27 0.002898510
## rea_pop.2020-04-01 .
## rea_pop.2020-04-05 .
## rea_pop.2020-04-08 0.805157014
# LASSO fit on the ICU training fold only, for train/test evaluation below.
lasso_reg_rea_pred <- glmnet(x_train_rea, y_train_pred_rea, alpha = 1, standardize = FALSE, lambda = optimal_lambda_lasso_rea)
# Compute fit metrics for a set of predictions.
#   true      : observed response values
#   predicted : model predictions for the same observations
#   df        : data used for prediction; nrow(df) gives the sample size
# Returns a one-row data.frame with columns RMSE and Rsquare.
eval_results <- function(true, predicted, df) {
  residual_ss <- sum((predicted - true)^2)
  total_ss <- sum((true - mean(true))^2)
  data.frame(
    RMSE = sqrt(residual_ss / nrow(df)),
    Rsquare = 1 - residual_ss / total_ss
  )
}
Prediction Train :
# Prediction and evaluation on train data (LASSO, ICU model).
predictions_lasso_rea <- predict(lasso_reg_rea_pred, s = optimal_lambda_lasso_rea, newx = x_train_rea)
results_rea_lasso_train <- eval_results(y_train_pred_rea, predictions_lasso_rea, train)
results_rea_lasso_train
## RMSE Rsquare
## 1 0.143576 0.9792885
Prediction Test :
# Prediction and evaluation on held-out test data (LASSO, ICU model).
predictions_test_lasso_rea <- predict(lasso_reg_rea_pred, s = optimal_lambda_lasso_rea, newx = x_test_rea)
results_rea_lasso_test <- eval_results(y_test_rea, predictions_test_lasso_rea, test)
results_rea_lasso_test
## RMSE Rsquare
## 1 0.1448191 0.9785156
library(tidyverse)
library(caret)
library(glmnet)
# Elastic net via caret: 10-fold CV, tuneLength = 10 builds a grid over both
# alpha (ridge/lasso mix) and lambda.
# NOTE(review): no set.seed() — bestTune varies between knits.
elastic <- train(x_train_hosp, y_train_pred_hosp, method = "glmnet",trControl = trainControl("cv", number = 10),tuneLength = 10)
# Model coefficients at the selected tuning parameters.
coef(elastic$finalModel, elastic$bestTune$lambda)
## 34 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -0.008926764
## nbr_pop .
## pourcentage_10_19ans .
## pourcentage_20_39ans .
## pourcentage_40_59ans -0.004900447
## pourcentage_60_74ans .
## pourcentage_75_etplus .
## taux_pauvrete_moy .
## taux_pauvrete_30etmoins .
## taux_pauvrete_30_39ans .
## taux_pauvrete_40_49ans .
## taux_pauvrete_50_59ans .
## taux_pauvrete_60_74ans .
## taux_pauvrete_75etplus .
## med_revenu .
## pourcentage_menage_imposes .
## frequence_obesite .
## agriculteurs .
## artisans -0.057353457
## cadres .
## prof_ind .
## employes .
## ouvriers .
## autres .
## tmoy.avril-2020 .
## tmoy.mars-2020 .
## hosp_pop.2020-03-27 .
## hosp_pop.2020-04-01 .
## hosp_pop.2020-04-05 .
## hosp_pop.2020-04-08 0.902882821
## rea_pop.2020-03-27 .
## rea_pop.2020-04-01 .
## rea_pop.2020-04-05 .
## rea_pop.2020-04-08 .
# Train-fold evaluation (elastic net, hospitalization model).
predictions_train_elasc <- predict(elastic, x_train_hosp)
results_hosp_elastic_train <- eval_results(y_train_pred_hosp, predictions_train_elasc, train)
results_hosp_elastic_train
## RMSE Rsquare
## 1 0.1858727 0.963861
# Held-out evaluation (elastic net, hospitalization model).
predictions_test <- predict(elastic, x_test_hosp)
results_hosp_elastic_test <- eval_results(y_test_hosp, predictions_test, test)
results_hosp_elastic_test
## RMSE Rsquare
## 1 0.167666 0.9736388
# Elastic net for the ICU model, same caret setup as for hospitalization.
elastic_rea <- train(x_train_rea, y_train_pred_rea, method = "glmnet",trControl = trainControl("cv", number = 10),tuneLength = 10)
# Model coefficients at the selected tuning parameters.
coef(elastic_rea$finalModel, elastic_rea$bestTune$lambda)
## 34 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -0.0009517152
## nbr_pop 0.0622094241
## pourcentage_10_19ans -0.0307762183
## pourcentage_20_39ans .
## pourcentage_40_59ans .
## pourcentage_60_74ans .
## pourcentage_75_etplus .
## taux_pauvrete_moy .
## taux_pauvrete_30etmoins -0.0095802884
## taux_pauvrete_30_39ans 0.0001116671
## taux_pauvrete_40_49ans 0.0689184944
## taux_pauvrete_50_59ans .
## taux_pauvrete_60_74ans .
## taux_pauvrete_75etplus 0.0034043882
## med_revenu .
## pourcentage_menage_imposes 0.0494862621
## frequence_obesite -0.0013427487
## agriculteurs 0.0321839332
## artisans -0.0125968158
## cadres .
## prof_ind -0.0061670906
## employes -0.0184621633
## ouvriers .
## autres .
## tmoy.avril-2020 0.0200253764
## tmoy.mars-2020 -0.0429287237
## hosp_pop.2020-03-27 .
## hosp_pop.2020-04-01 -0.2103274077
## hosp_pop.2020-04-05 .
## hosp_pop.2020-04-08 0.3393742308
## rea_pop.2020-03-27 .
## rea_pop.2020-04-01 .
## rea_pop.2020-04-05 .
## rea_pop.2020-04-08 0.7664362667
# Train-fold evaluation (elastic net, ICU model).
predictions_train_elasc_rea <- predict(elastic_rea, x_train_rea)
results_hosp_elastic_train_rea <- eval_results(y_train_pred_rea, predictions_train_elasc_rea, train)
results_hosp_elastic_train_rea
## RMSE Rsquare
## 1 0.1540967 0.976142
# Held-out evaluation (elastic net, ICU model).
predictions_test_rea <- predict(elastic_rea, x_test_rea)
results_hosp_elastic_test_rea <- eval_results(y_test_rea, predictions_test_rea, test)
results_hosp_elastic_test_rea
## RMSE Rsquare
## 1 0.1484219 0.9774334
Avant de conclure sur la sélection de variables explicatives, on remarque d’abord que les modèles que nous avons construits ont bien fonctionné. Ce qui signifie que nos variables expliquent bien le nombre d’hospitalisations.
# Comparison table of all four hospitalization models (ridge, lasso, linear
# regression, elastic net) on train and test folds. results_lm_hosp_* come
# from an earlier chunk of the report.
results_rmse_train <- c(results_hosp_ridge_train$RMSE, results_hosp_lasso_train$RMSE, results_lm_hosp_train$RMSE, results_hosp_elastic_train$RMSE)
results_rmse_test <- c(results_hosp_ridge_test$RMSE, results_hosp_lasso_test$RMSE, results_lm_hosp_test$RMSE, results_hosp_elastic_test$RMSE)
results_rsquare_train <- c(results_hosp_ridge_train$Rsquare, results_hosp_lasso_train$Rsquare, results_lm_hosp_train$Rsquare, results_hosp_elastic_train$Rsquare)
results_rsquare_test <- c(results_hosp_ridge_test$Rsquare, results_hosp_lasso_test$Rsquare, results_lm_hosp_test$Rsquare, results_hosp_elastic_test$Rsquare)
# Column order below matches the list order: RMSE train, Rsquare train,
# RMSE test, Rsquare test.
list_results <- list(results_rmse_train, results_rsquare_train, results_rmse_test, results_rsquare_test)
result <- data.frame(list_results)
colnames(result) <- c("RMSE train","Rsquare Train","RMSE test", "Rsquare test")
rownames(result) <- c("ridge","lasso", "Reg lineaire", "Elastic net")
result
## RMSE train Rsquare Train RMSE test Rsquare test
## ridge 0.1397156 0.9795810 0.1769848 0.9706271
## lasso 0.1678551 0.9705277 0.1554564 0.9773383
## Reg lineaire 0.1315996 0.9682795 0.2510584 0.9682795
## Elastic net 0.1858727 0.9638610 0.1676660 0.9736388
# NOTE(review): the "Reg lineaire" row shows identical train and test
# R-squared (0.9682795) — check how results_lm_hosp_* were computed upstream.
La régression ridge nous permet d’estimer les coefficients de la regression linéaire en prenant pour Y le nombre d’hospitalisation du pic du covid en avril 2020. Nous rappelons que plus un coefficient est faible et moins il est utile dans notre prédiction.
C’est pourquoi nous avons décidé de sélectionner les variables dont le coefficient (en valeur absolue) est supérieur à 0.05.
# Keep only the ridge coefficients with |coef| > 0.05 (hospitalization model).
# Ridge does not zero coefficients, so summary() lists all of them; rownames
# of the beta matrix supply the variable names.
summary_ridge_hosp = summary(ridge_reg_hosp$beta)
names_ridge_hosp = rownames(ridge_reg_hosp$beta)
data_ridge_hosp = as.data.frame(summary_ridge_hosp$x,names_ridge_hosp)
colnames(data_ridge_hosp)[1] <- 'coef'
data_ridge_hosp <- subset(data_ridge_hosp, abs(data_ridge_hosp$coef) > 0.05)
data_ridge_hosp
## coef
## pourcentage_75_etplus 0.06332588
## taux_pauvrete_30_39ans 0.09346738
## taux_pauvrete_40_49ans -0.09886162
## taux_pauvrete_50_59ans -0.06944290
## taux_pauvrete_60_74ans 0.15494225
## taux_pauvrete_75etplus -0.08735270
## artisans -0.05328804
## hosp_pop.2020-03-27 -0.09670569
## hosp_pop.2020-04-01 -0.09062837
## hosp_pop.2020-04-05 0.43471254
## hosp_pop.2020-04-08 0.75994307
## rea_pop.2020-03-27 0.12176970
## rea_pop.2020-04-01 -0.14287732
## rea_pop.2020-04-05 -0.15800792
## rea_pop.2020-04-08 0.06833564
# Extract the non-zero LASSO coefficients (hospitalization model) with their
# variable names. summary() on the sparse beta matrix returns the (i, j, x)
# triplets of its non-zero entries; index i recovers the variable names.
summary_lasso_hosp = summary(lasso_reg$beta)
names_lasso = rownames(lasso_reg$beta)
# Vectorized lookup replaces the previous grow-a-list loop (and its
# right-assignment `->`), with identical results.
names_list_lasso = names_lasso[summary_lasso_hosp$i]
data_lasso_hosp = as.data.frame(x=summary_lasso_hosp$x, names_list_lasso)
colnames(data_lasso_hosp)[1] <- 'coef'
data_lasso_hosp
## coef
## pourcentage_40_59ans -0.017593004
## taux_pauvrete_60_74ans 0.009469029
## taux_pauvrete_75etplus -0.009621545
## pourcentage_menage_imposes 0.018398826
## frequence_obesite 0.009477722
## artisans -0.031270920
## prof_ind -0.003969117
## hosp_pop.2020-03-27 -0.034290737
## hosp_pop.2020-04-08 1.044647996
## rea_pop.2020-04-01 -0.071673787
# Extract the non-zero elastic-net coefficients (hospitalization model) with
# their variable names, using the (i, x) triplets from summary() on the
# sparse coefficient matrix.
summary_elastic <- summary(coef(elastic$finalModel, elastic$bestTune$lambda))
names_elastic <- rownames(coef(elastic$finalModel, elastic$bestTune$lambda))
# Vectorized lookup replaces the previous grow-a-list loop, with identical
# results.
names_list_elastic = names_elastic[summary_elastic$i]
data_elastic_hosp = as.data.frame(summary_elastic$x, names_list_elastic)
colnames(data_elastic_hosp)[1] <- 'coef'
data_elastic_hosp
## coef
## (Intercept) -0.008926764
## pourcentage_40_59ans -0.004900447
## artisans -0.057353457
## hosp_pop.2020-04-08 0.902882821
De même pour la réanimation du pic du covid, les modèles que nous avons construits ont bien fonctionné. Ce qui signifie que nos variables expliquent bien le nombre de réanimations.
# Comparison table of all four ICU models (ridge, lasso, linear regression,
# elastic net) on train and test folds. results_lm_rea_* come from an earlier
# chunk of the report.
results_rmse_train_rea <- c(results_rea_ridge_train$RMSE, results_rea_lasso_train$RMSE, results_lm_rea_train$RMSE, results_hosp_elastic_train_rea$RMSE)
results_rmse_test_rea <- c(results_rea_ridge_test$RMSE, results_rea_lasso_test$RMSE, results_lm_rea_test$RMSE, results_hosp_elastic_test_rea$RMSE)
results_rsquare_train_rea <- c(results_rea_ridge_train$Rsquare, results_rea_lasso_train$Rsquare, results_lm_rea_train$Rsquare, results_hosp_elastic_train_rea$Rsquare)
results_rsquare_test_rea <- c(results_rea_ridge_test$Rsquare, results_rea_lasso_test$Rsquare, results_lm_rea_test$Rsquare, results_hosp_elastic_test_rea$Rsquare)
# Column order below matches the list order: RMSE train, Rsquare train,
# RMSE test, Rsquare test.
list_results_rea <- list(results_rmse_train_rea, results_rsquare_train_rea, results_rmse_test_rea, results_rsquare_test_rea)
result_rea <- data.frame(list_results_rea)
colnames(result_rea) <- c("RMSE train","Rsquare Train","RMSE test", "Rsquare test")
rownames(result_rea) <- c("ridge","lasso", "Reg lineaire", "elastic net")
result_rea
## RMSE train Rsquare Train RMSE test Rsquare test
## ridge 0.12502553 0.9842947 0.1784203 0.9673894
## lasso 0.14357595 0.9792885 0.1448191 0.9785156
## Reg lineaire 0.09520641 0.9845756 0.3237400 0.9845756
## elastic net 0.15409665 0.9761420 0.1484219 0.9774334
# NOTE(review): the "Reg lineaire" row shows identical train and test
# R-squared (0.9845756) — check how results_lm_rea_* were computed upstream.
La régression ridge nous permet d’estimer les coefficients de la regression linéaire en prenant pour Y le nombre de réanimations du pic du covid en avril 2020. Nous rappelons que plus un coefficient est faible et moins il est utile dans notre prédiction.
C’est pourquoi nous avons décidé de sélectionner les variables dont le coefficient (en valeur absolue) est supérieur à 0.05.
# Keep only the ridge coefficients with |coef| > 0.05 (ICU model), mirroring
# the hospitalization-model selection above.
summary_ridge_rea = summary(ridge_reg$beta)
names_ridge_rea = rownames(ridge_reg$beta)
data_ridge_rea = as.data.frame(summary_ridge_rea$x,names_ridge_rea)
colnames(data_ridge_rea)[1] <- 'coef'
data_ridge_rea <- subset(data_ridge_rea, abs(data_ridge_rea$coef) > 0.05)
data_ridge_rea
## coef
## pourcentage_10_19ans -0.08280543
## pourcentage_60_74ans 0.09683008
## pourcentage_75_etplus -0.09682884
## taux_pauvrete_moy 0.05005384
## taux_pauvrete_30etmoins -0.09439588
## taux_pauvrete_30_39ans 0.10376484
## taux_pauvrete_40_49ans 0.15579293
## taux_pauvrete_50_59ans -0.08306548
## taux_pauvrete_60_74ans -0.07365990
## taux_pauvrete_75etplus 0.06094038
## med_revenu 0.08920151
## artisans -0.07551356
## hosp_pop.2020-04-01 -0.33198557
## hosp_pop.2020-04-05 -0.10025322
## hosp_pop.2020-04-08 0.52727286
## rea_pop.2020-03-27 0.05452789
## rea_pop.2020-04-01 -0.11755428
## rea_pop.2020-04-05 0.11308865
## rea_pop.2020-04-08 0.73321013
# Extract the non-zero LASSO coefficients (ICU model) with their variable
# names, using the (i, x) triplets from summary() on the sparse beta matrix.
summary_lasso_rea = summary(lasso_reg_rea$beta)
names_lasso = rownames(lasso_reg_rea$beta)
# Vectorized lookup replaces the previous grow-a-list loop, with identical
# results.
names_list_lasso = names_lasso[summary_lasso_rea$i]
data_lasso_rea = as.data.frame(x=summary_lasso_rea$x, names_list_lasso)
colnames(data_lasso_rea)[1] <- 'coef'
data_lasso_rea
## coef
## nbr_pop 0.046934203
## pourcentage_10_19ans -0.043932059
## pourcentage_20_39ans 0.009809057
## pourcentage_40_59ans -0.002271553
## pourcentage_75_etplus -0.001630267
## taux_pauvrete_30etmoins -0.022755098
## taux_pauvrete_30_39ans 0.030739584
## taux_pauvrete_40_49ans 0.031492337
## pourcentage_menage_imposes 0.024655109
## agriculteurs 0.025178225
## artisans -0.041551843
## tmoy.avril-2020 0.027777759
## tmoy.mars-2020 -0.024754808
## hosp_pop.2020-04-01 -0.317429240
## hosp_pop.2020-04-08 0.435927878
## rea_pop.2020-03-27 0.002898510
## rea_pop.2020-04-08 0.805157014
# Extract the non-zero elastic-net coefficients (ICU model) with their
# variable names, using the (i, x) triplets from summary() on the sparse
# coefficient matrix.
summary_elastic <- summary(coef(elastic_rea$finalModel, elastic_rea$bestTune$lambda))
names_elastic <- rownames(coef(elastic_rea$finalModel, elastic_rea$bestTune$lambda))
# Vectorized lookup replaces the previous grow-a-list loop, with identical
# results.
names_list_elastic = names_elastic[summary_elastic$i]
data_elastic_rea = as.data.frame(summary_elastic$x, names_list_elastic)
colnames(data_elastic_rea)[1] <- 'coef'
data_elastic_rea
## coef
## (Intercept) -0.0009517152
## nbr_pop 0.0622094241
## pourcentage_10_19ans -0.0307762183
## taux_pauvrete_30etmoins -0.0095802884
## taux_pauvrete_30_39ans 0.0001116671
## taux_pauvrete_40_49ans 0.0689184944
## taux_pauvrete_75etplus 0.0034043882
## pourcentage_menage_imposes 0.0494862621
## frequence_obesite -0.0013427487
## agriculteurs 0.0321839332
## artisans -0.0125968158
## prof_ind -0.0061670906
## employes -0.0184621633
## tmoy.avril-2020 0.0200253764
## tmoy.mars-2020 -0.0429287237
## hosp_pop.2020-04-01 -0.2103274077
## hosp_pop.2020-04-08 0.3393742308
## rea_pop.2020-04-08 0.7664362667